Orla O’Shea PSY6422 Project

Project Script

Below is the final script with relevant narrative that produces the heatmap visualisation:

library(dplyr) 

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyr) #Both packages allowed me to clean the data before visualising it.

#I had to remove the first few headings to make sure the data read correctly,
#and then separated the variable columns from the data as they had merged together.
df <- read.csv("gcses-and-equivalent-in-english-and-maths-table-data.csv", 
               skip = 7, 
               header = FALSE, 
               quote = "\"", 
               sep = ",")
names(df) <- c("Area code","Area name","2016-17","2017-18","2018-19", "2019-20","2020-21","2021-22","2022-23","2023-24")

#Accounting for a space mark in one of the headings.
df <- df %>% filter(`Area code` != "Area code")

#Had to separate the data for each year which had also merged together.
df_long <- df %>% 
  pivot_longer( 
    cols = `2016-17`:`2023-24`, 
    names_to = "Year", 
    values_to = "Value")

#Converted from a wide to a long dataset so that I could demonstrate change over time,
#rather than focus on regional trends.
#The numbers were originally down as character rather than numeric data, and cleaned 
#the data to remove N/A values that weren't converted.
df_long <- df_long %>% mutate(Value = as.numeric(Value))
df_long_clean <- df_long %>% 
  mutate(Value = as.numeric(Value)) %>% 
  filter(!is.na(Value))

library(ggplot2)
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.1     ✔ readr     2.1.5
✔ lubridate 1.9.4     ✔ stringr   1.5.2
✔ purrr     1.1.0     ✔ tibble    3.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly) #These packages allowed me to visualise the data and make it interactive.

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
#I wanted to create a heatmap to communicate the bigger picture, demonstrating 
#trends over time but also accounting for regional differences.
df_heatmap <- df_long_clean %>%
  mutate(`Area name` = fct_reorder(`Area name`, Value, .fun = 'mean')) %>%
#Mutated the y-axis so it would be shown from highest to lowest regional percentages
#rather than alphabetically.
  ggplot(mapping = aes(x = Year, y = `Area name`, fill = Value)) +
  geom_tile() +
  scale_fill_viridis_c(option = "magma", direction = -1) +
#Wanted the darker colours to represent higher percentages and vice versa.
  labs(title = "Attainment of GCSEs in English and Maths across England",
       x = "Year", y = "Lower-tier/Unitary Authority", fill = "Pass Rate") +
  theme(axis.text.y = element_text(size = 4))

df_heatmap

df_heatmap +
  theme_minimal() +
  theme(
    legend.position = "right",
    plot.title = element_text(size = 11, face = "bold"),
    axis.title = element_text(size = 8),
    panel.grid.minor = element_blank()
  )

#Made the heatmap longer in order to leave spaces between the y-axis variables,
#and used the plotly package to make the map interactive so you can get the specific
#percentages of qualifications for each region per academic year.
ggplotly(df_heatmap, height = 1600)